library(tidyverse)
library(magrittr)
library(limma)
library(edgeR)
library(ggfortify)
library(ngsReports)
library(AnnotationHub)
library(readxl)
library(scales)
library(ggpubr)
library(cowplot)
library(ggrepel)
library(msigdbr)
library(pheatmap)
library(ggeasy)
library(fgsea)
library(harmonicmeanp)
library(goseq)
library(pathview)
library(RColorBrewer)
library(UpSetR)
library(pander)
library(kableExtra)

theme_set(theme_bw())

Introduction

The following analysis will compare the effect of heterozygosity for an early-onset familial Alzheimer’s disease-like (EOfAD-like) mutation, or a familial Hidradenitis supurativa (fHS-like) mutation in the presenilin 1 (PSEN1) gene on the brain transcriptome of zebrafish at 6 months of age. A family of zebrafish were generated, which contained fish with psen1 genotypes: EOfAD-like/+, fHS-like/+, +/+ and EOfAD-like/fHS-like (transheterozygous). These fish were sacrificed at 6 months of age, their brains removed and RNA was extracted for RNA-seq. Genomic DNA was extracted from tails of these fish for genotyping by PCR, while sex was determined by inspection of body shape and innards. n = 4 fish per genotype and sex were actually used in the RNA seq experiment.

mRNA was prepared for RNA-seq by SAHMRI. polyA+, stranded libraries were generated. Also, uniique molecular identifers (UMIs) were added to the RNA moleculaes to reduce noise due to PCR duplicates. These UMIs were added to each read using fastp. Then, reads were aligned to the zebrafish genome (GRCz11, Ensembl release 98) using STAR (v2.7.0d). Gene expression values were counted using featureCounts.

Fish metadata

meta <- 
  read_excel("meta_fishInfo_2.xlsx") %>% 
  dplyr::filter(!is.na(Position_plate_seq)) %>% 
  dplyr::select(fish_id, batch_killed, Genotype, Sex, RINe, Position_plate_seq) %>% 
  left_join(read_delim("featureCounts_umidedup_release98.txt", 
                       delim = "\t", skip = 1) %>%
              set_names(basename(names(.))) %>% 
              as.data.frame() %>% 
              colnames() %>% 
              .[grepl(., pattern = "KB")] %>% 
              as_tibble() %>% 
              set_colnames("bamName") %>% 
              mutate(Position_plate_seq  = str_extract(string = bamName, 
                                                       pattern = '[:alpha:]{1}[:digit:]+')) ) %>% 
  dplyr::filter(!is.na(bamName)) %>% 
  mutate(Genotype = str_replace(Genotype, 
                                pattern = "AI-", 
                                replacement = "fAI-"),
         sample = paste0(Genotype, "_", Sex, "_",fish_id), 
         Genotype = factor(Genotype, levels = c("WT", "EOfAD-like/+", "fAI-like/+")), 
         Genotype_2 = case_when(
           Genotype == "EOfAD-like/+" ~ "T428del/+", 
           Genotype == "fAI-like/+" ~ "W233fs/+", 
          TRUE ~ "WT"
         ), 
         Genotype_2 = factor(Genotype_2, levels = c("WT", "T428del/+", "W233fs/+")))

meta %>% 
  dplyr::select(fish_id, Genotype, Sex, RINe) %>% 
  kable(caption = "Sample info") %>% 
  kableExtra::kable_styling("basic", full_width = F)
Sample info
fish_id Genotype Sex RINe
4 WT F 9.4
9 EOfAD-like/+ M 9.0
19 WT F 9.1
25 WT F 9.4
12 fAI-like/+ F 8.7
46 EOfAD-like/+ F 9.2
21 fAI-like/+ M 9.4
50 EOfAD-like/+ M 9.3
55 WT M 9.1
22 fAI-like/+ F 9.3
54 WT M 9.3
26 fAI-like/+ M 9.4
27 EOfAD-like/+ F 9.3
34 WT M 9.0
47 fAI-like/+ F 9.2
29 EOfAD-like/+ M 9.5
35 WT M 9.3
49 fAI-like/+ M 9.5
30 EOfAD-like/+ F 9.6
36 WT F 9.7
44 fAI-like/+ M 9.8
37 WT M 9.3
45 fAI-like/+ F 9.4
40 EOfAD-like/+ F 9.4

Annotation objects:

ah <- AnnotationHub() %>%
    subset(species == "Danio rerio") %>%
    subset(rdataclass == "EnsDb")
ensDb <- ah[["AH74989"]] # for release 98
grTrans <- transcripts(ensDb)
trLengths <- exonsBy(ensDb, "tx") %>%
    width() %>%
    vapply(sum, integer(1))
mcols(grTrans)$length <- trLengths[names(grTrans)]

gcGene <- grTrans %>%
  mcols() %>%
  as.data.frame() %>%
  dplyr::select(gene_id, tx_id, gc_content, length) %>%
  as_tibble() %>%
  group_by(gene_id) %>%
  summarise(
    gc_content = sum(gc_content*length) / sum(length),
    length = ceiling(median(length))
  )
grGenes <- genes(ensDb)
mcols(grGenes) %<>%
  as.data.frame() %>%
  left_join(gcGene) %>%
  as.data.frame() %>%
  DataFrame()

NGS reports

fastqc_raw <- list.files(
    path = "fastqc_raw/",
    pattern = "zip", 
    recursive = TRUE,
    full.names = TRUE
    ) %>% 
    FastqcDataList()

Note that the duplicated number of reads was minimised as we have deduplicaated PCR duplicates from the Unique Molecular Identifiers (UMIs).

plotReadTotals(fastqc_raw)
*Library Sizes for each paired end read for each sample before any processing was undertaken.*

Library Sizes for each paired end read for each sample before any processing was undertaken.

GC content was also assessed. The distribution in the plot below is similar to distributions we have observed in previous polyA+ RNA-seq experiments.

plotGcContent(
  x = fastqc_raw, 
  plotType = "line",
  gcType = "Transcriptome",
  species = "Drerio"
) +
  theme(legend.position = "none")

Filtering lowly expressed genes

Genes which are lowly expressed are uninformative for DE analysis. I will follow the 10/min.lib.size rule to filter lowly expressed genes. The effect of filtering is be shown in the density plots below.

minCPM <- 0.1
minSamples <- 12

featureCounts <- 
  read_delim("featureCounts_umidedup_release98.txt", delim = "\t", skip = 1) %>%
  set_names(basename(names(.))) %>% 
  as.data.frame() %>%
    dplyr::select(-c(Chr, Start, End, Length, Strand)) %>% 
    gather(key = "bamName", value = "counts", contains("KB")) %>% 
    left_join(meta) %>% 
    dplyr::select(Geneid, counts, sample) %>% 
    spread(key = "sample", value = "counts") %>% 
    column_to_rownames("Geneid")
  
a <- featureCounts %>% 
    cpm(log = TRUE) %>%
    as.data.frame() %>% 
  pivot_longer(
    cols = everything(),
    names_to = "sample",
    values_to = "logCPM"
  ) %>%
  split(f = .$sample) %>%
  lapply(function(x){
    d <- density(x$logCPM)
    tibble(
      sample = unique(x$sample),
      x = d$x,
      y = d$y
    )
  }) %>%
  bind_rows() %>%
  left_join(meta) %>% 
  ggplot(aes(x, y, colour = Genotype, group = sample)) +
  geom_line() +
  labs(
    x = "logCPM",
    y = "Density",
    colour = "Genotype"
  )+
  ggtitle("Before filtering") +
  theme_bw()

b <- featureCounts %>% 
  .[rowSums(cpm(.) >= 1) >= minSamples,] %>% 
    cpm(log = TRUE) %>%
    as.data.frame() %>% 
  pivot_longer(
    cols = everything(),
    names_to = "sample",
    values_to = "logCPM"
  ) %>%
  split(f = .$sample) %>%
  lapply(function(x){
    d <- density(x$logCPM)
    tibble(
      sample = unique(x$sample),
      x = d$x,
      y = d$y
    )
  }) %>%
  bind_rows() %>%
  left_join(meta) %>% 
  ggplot(aes(x, y, colour = Genotype, group = sample)) +
  geom_line() +
  labs(
    x = "logCPM",
    y = "Density",
    colour = "Genotype"
  )+
  ggtitle("After filtering") +
  theme_bw()

ggarrange(a, b, common.legend = TRUE)

DGE object

x <- featureCounts %>% 
  as.matrix() %>% 
  .[rowSums(cpm(.) >= minCPM) >= minSamples,] %>%
  DGEList(
    samples = tibble(sample = colnames(.)) %>%
      left_join(meta),
    genes = grGenes[rownames(.)] %>%
      as.data.frame() %>%
      dplyr::select(
        chromosome = seqnames, start, end, 
        gene_id, gene_name, gene_biotype, description, entrezid
      ) %>% 
      left_join(gcGene) %>% 
      as_tibble()
  ) %>%
  calcNormFactors()

Gene-level count data as output by featureCounts, was loaded and formed into a DGEList object. During this process, genes were removed if they were not considered as detectable (CPM < 0.1 in > 12 samples). This translates to > 7 reads assigned a gene in at least half of the RNA seq samples..

These filtering steps returned gene-level counts for 22,005 genes, with total library sizes between 69,268,551, 109,940,513 reads assigned to genes.

Genotype checks

For the purposes of genotype checking, the umi-dedup .bam files were converted back to paired fastq files, then were pseudo-aligned to a modified version of the zebrafish trannscriptome (GRCz11, cDNA , primary assembly). This transcritome manually included the psen1 transcripts with the mutations.

transcounts <- list.files("kallisto/", full.names = TRUE) %>% 
  catchKallisto()
## Reading kallisto//1_KB_A10_HTVLLDRXX_AACTCGGA-GTCTTGCA_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//10_KB_B6_HTVLLDRXX_GCACACAA-GATTGCTC_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//11_KB_B7_HTVLLDRXX_CTCCTAGT-AGATGAGG_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//12_KB_B8_HTVLLDRXX_TCTTCGAC-GAGCAGTA_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//13_KB_B9_HTVLLDRXX_GACTACGA-GTCGGTAA_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//14_KB_C10_HTVLLDRXX_CCAACACT-AGTTCGTC_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//15_KB_C11_HTVLLDRXX_TCTAGGAG-CGTTGCAA_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//16_KB_C12_HTVLLDRXX_CTCGAACA-TTCGTTGG_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//17_KB_C1_HTVLLDRXX_ACCATCCT-GTTCATGG_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//18_KB_C2_HTVLLDRXX_CGTCCATT-AGGAGGAA_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//19_KB_C3_HTVLLDRXX_AACTTGCC-GAAGTTGG_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//2_KB_A11_HTVLLDRXX_GTCGAGAA-GCTGTAAG_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//20_KB_C4_HTVLLDRXX_GTACACCT-TTGGTCTC_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//21_KB_C5_HTVLLDRXX_ACGAGAAC-CGAACTGT_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//22_KB_C6_HTVLLDRXX_CGACCTAA-CAGGTTAG_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//23_KB_C7_HTVLLDRXX_TACATCGG-TGATCGGA_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//24_KB_C8_HTVLLDRXX_ATCGTCTC-GTCCTTCT_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//3_KB_A1_HTVLLDRXX_CGCTACAT-CGTAGGTT_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//4_KB_A6_HTVLLDRXX_AATCCAGC-TAGGATGC_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//5_KB_A7_HTVLLDRXX_CGTCTAAC-ACTCGTTG_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//6_KB_B11_HTVLLDRXX_ACTCCTAC-AGTTACGG_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//7_KB_B12_HTVLLDRXX_CTTCCTTC-ACCTGGAA_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//8_KB_B3_HTVLLDRXX_ACAACAGC-TTGTCGGT_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//9_KB_B5_HTVLLDRXX_ATGACAGG-TGGCATGT_merged.umidup, 51752 transcripts, 100 bootstraps
colnames(transcounts$counts) %<>% 
  basename() %>% 
  str_extract(pattern = '[:alpha:]{1}[:digit:]+')

psen1_txids <- grTrans %>% 
  as_tibble() %>% 
  dplyr::filter(gene_id == "ENSDARG00000004870") %>% 
  .$tx_id_version

psen1_txids %<>% 
  sapply(function(x) {
    x %>% 
      paste0("_W233fs")
  }) %>% 
  unlist() %>% 
  as_tibble() %>% 
  bind_rows(psen1_txids %>% 
  sapply(function(x) {
    x %>% 
      paste0("_T428del")
  }) %>% 
  unlist() %>% 
  as_tibble() 
  ) %>% 
  bind_rows(psen1_txids %>% 
              as_tibble)

transcounts$counts %>% 
  cpm() %>% 
  as.data.frame() %>%
  rownames_to_column("tx_id") %>% 
  as_tibble() %>% 
  dplyr::filter(tx_id %in% psen1_txids$value) %>% 
  gather(key = "Position_plate_seq", value = "CPM", !starts_with("tx")) %>% 
  mutate(psen1 = case_when(
    grepl(tx_id, pattern = "T428del") ~ "T428del", 
    grepl(tx_id, pattern = "W233fs") ~ "W233fs",
    TRUE ~ "WT"
  )) %>% 
  group_by(Position_plate_seq, psen1) %>%
  summarise(CPM = sum(CPM)) %>% 
  left_join(meta) %>% 
  ggplot(aes(Position_plate_seq, CPM, fill = psen1)) +
  geom_bar(stat = "identity", colour = "black") +
  scale_fill_viridis_d()+
  theme_bw() +
  facet_wrap(~Genotype, scales = "free_x")

This didnt really work. So I manually inspected the alignments using IGV and counted the number of reads aligning to the WT and mutant sequences at each site. For the T428 site, I took the average number of reads for the 3 codons, for the W233 site, it was the avergae number of reads for the 2 codons.

psen1_gene_id <- "ENSDARG00000004870"

ggarrange(
  cpm(x, log = TRUE) %>% 
  as.data.frame() %>% 
  .[psen1_gene_id,] %>% 
  gather(key = "sample", value = "logCPM") %>% 
  left_join(x$samples) %>% 
  ggplot(aes(x = Genotype, y = logCPM, fill = Genotype)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(aes(shape = Sex), 
              size = 3,  colour = "black") +
  theme_bw() +
  easy_rotate_x_labels(angle = -45) +
    ggtitle("psen1 expression") + 
  labs(x = "Genotype", 
       fill = "Genotype"),
ggarrange( 
  read_excel("numreads_psen1Sites.xlsx") %>% 
    left_join(x$samples) %>% 
    mutate(total_number_reads_T428del = T428_wt + T428del_reads, 
           total_number_reads_W233fs = W233_wt + W233fs_reads) %>% 
    ggplot(aes(x = sample, y = total_number_reads_T428del)) +
    geom_col(fill = "#350463") +
    geom_col(aes(y = T428del_reads), fill = "#1F968CFF") +
    facet_wrap(~Genotype, scales = "free_x") +
    theme_bw() +
    labs(x = "Sample", 
         y = "Number of reads") +
    easy_rotate_x_labels(angle = -90) +
    ggtitle("Number of reads aligning to the T428 site"),
  
  read_excel("numreads_psen1Sites.xlsx") %>% 
    left_join(x$samples) %>% 
    mutate(total_number_reads_T428del = T428_wt + T428del_reads, 
           total_number_reads_W233fs = W233_wt + W233fs_reads) %>% 
    ggplot(aes(x = sample, y = total_number_reads_W233fs)) +
    geom_col(fill = "#350463") +
    geom_col(aes(y = W233fs_reads), fill = "#FDE725FF") +
    facet_wrap(~Genotype, scales = "free_x") +
    theme_bw() +
    labs(x = "Sample", 
         y = "Number of reads") +
    easy_rotate_x_labels(angle = -90) +
    ggtitle("Number of reads aligning to the W233 site"), 
  labels = c("B", "C"),
  ncol = 1, 
  nrow = 2),
  common.legend = F, 
labels = c("A", "")
) 

read_excel("numreads_psen1Sites.xlsx") %>% 
  left_join(x$samples) %>% 
  gather(key = "allele", value = "numReads", 2:5) %>% 
  mutate(scaled_reads = numReads*norm.factors, 
         half_scaled_reads = scaled_reads/2) %>% 
  dplyr::filter(allele %in% c("W233_wt")) %>% 
  ggplot(aes(x = Genotype)) +
  geom_boxplot(aes(fill = Genotype, y = scaled_reads), 
               outlier.shape = NA) +
  geom_point(aes(shape = Sex, y = scaled_reads),
             size= 4, 
             position = position_jitter(seed = 1, height = 0, width = 0.2)) +
  # add the secod layer of points
  geom_boxplot(aes(fill = Genotype, y = half_scaled_reads), 
               outlier.shape = NA,
               alpha = 0.5,
               data = . %>% dplyr::filter(Genotype == "WT")) +
  geom_point(aes(shape = Sex, y = half_scaled_reads), 
             size= 4, 
             position = position_jitter(seed = 1, height = 0, width = 0.2),
             alpha = 0.5,
             data = . %>% dplyr::filter(Genotype == "WT")
  ) +
  stat_summary(aes(y = half_scaled_reads), 
               data = . %>% dplyr::filter(Genotype == "WT"),
               fun = "mean", geom = "point", shape= 23, size= 3, alpha = 0.5, fill= "red") +
  stat_summary(aes( y = scaled_reads), 
               fun = "mean", geom = "point", shape= 23, size= 3, fill= "red") +
  theme_bw() +
  scale_y_continuous(limits = c(20, 180)) +
  labs(y = "Number of reads aligning to the wild type W233 site\n(scaled by RNA-seq library size)") +
  ggtitle("Number of reads aligning to the WT W233 site") 

# t.test(x = read_excel("numreads_psen1Sites.xlsx") %>% 
#          left_join(x$samples) %>% 
#            gather(key = "allele", value = "numReads", 2:5) %>% 
#            mutate(scaled_reads = numReads*norm.factors) %>%  
#            dplyr::filter(allele %in% c("W233_wt"), 
#                          Genotype == "fAI-like/+") %>% 
#            .$scaled_reads, 
#          y = read_excel("numreads_psen1Sites.xlsx") %>% 
#            left_join(x$samples) %>% 
#            gather(key = "allele", value = "numReads", 2:5) %>% 
#            mutate(scaled_reads = numReads*norm.factors) %>%  
#            dplyr::filter(allele %in% c("W233_wt"), 
#                          Genotype == "WT") %>% 
#            .$scaled_reads/2
#         )

PCA

First, I will explore the overall similarity between samples using principal component analysis. The first two principal components explained less of the total variability than one would expxect. No clear seperatation between samples is observed of Genotype or Sex.

ggarrange(
x$counts %>% 
  cpm(log=TRUE) %>% 
  t() %>% 
  prcomp() %>% 
  autoplot(data = tibble(sample = rownames(.$x)) %>%
      left_join(x$samples),
    colour = "Genotype", 
    shape = "Sex", 
    size = 6
  ) +
  theme_bw() + 
  theme(aspect.ratio = 1) +
  labs(colour = "Genotype") ,
  
x$counts %>% 
  cpm(log=TRUE) %>% 
  t() %>% 
  prcomp() %>% 
  summary() %>% 
  .$importance %>% 
  as.data.frame() %>% 
  t() %>% 
  as.data.frame() %>% 
  rownames_to_column("PC") %>% 
  mutate(`Proportion of Variance` = `Proportion of Variance`*100, 
         `Cumulative Proportion` = `Cumulative Proportion`*100) %>% 
  ggplot(aes(x = reorder(PC, -`Proportion of Variance`))) +
  geom_col(aes( y = `Proportion of Variance`)) +
  labs(y = "Proportion of Variance (%)", 
       x = "Principal Component") +
  geom_point(aes(y = `Cumulative Proportion`))+
  geom_line(aes(y = `Cumulative Proportion`, group = 1), show.legend = TRUE) +
  theme_bw() +
  theme(aspect.ratio = 1) +
  rotate_x_text(angle = -45),
common.legend = FALSE, 
labels = "AUTO"
) 

  # ggsave("plots/PCA_scree.png", width = 18, height = 10, units = "cm", dpi = 800, scale = 1.5)

I also want to see if any technical artefacts correlate with PC1 or PC2 (i.e. RNA-seq library size or RNA quality). No observed correlations are presnt.

ggarrange(
x$counts %>% 
  cpm(log = TRUE) %>% 
  t() %>% 
  prcomp %>% 
  .$x %>% 
  as.data.frame() %>% 
  rownames_to_column("sample") %>% 
  left_join(x$samples) %>% 
  mutate(`Library Size (millions) ` = lib.size/1e6) %>% 
  ggplot(aes(x = PC1, y = `Library Size (millions) `)) +
  geom_point(aes( colour = Genotype, shape = Sex), 
             size = 6) +
  geom_smooth(method = "lm") +
  labs(colour = "Genotype") +
  theme_bw(),

x$counts %>% 
  cpm(log = TRUE) %>% 
  t() %>% 
  prcomp %>% 
  .$x %>% 
  as.data.frame() %>% 
  rownames_to_column("sample") %>% 
  left_join(x$samples) %>% 
  mutate(`Library Size (millions) ` = lib.size/1e6) %>% 
  ggplot(aes(x = PC2, y = `Library Size (millions) `)) +
  geom_point(aes( colour = Genotype, shape = Sex), 
             size = 6) +
  geom_smooth(method = "lm") +
  labs(colour = "Genotype") +
  theme_bw(),
common.legend = TRUE, 
labels = "AUTO"
) 

  #ggsave("plots/PC1vLibsize.png", height = 4)
ggarrange(
x$counts %>% 
  cpm(log = TRUE) %>% 
  t() %>% 
  prcomp %>% 
  .$x %>% 
  as.data.frame() %>% 
  rownames_to_column("sample") %>% 
  left_join(x$samples) %>% 
  ggplot(aes(x = PC1, y = RINe)) +
  geom_point(aes( colour = Genotype, shape = Sex), 
             size = 6) +
  geom_smooth(method = "lm") +
  theme_bw(), 
x$counts %>% 
  cpm(log = TRUE) %>% 
  t() %>% 
  prcomp %>% 
  .$x %>% 
  as.data.frame() %>% 
  rownames_to_column("sample") %>% 
  left_join(x$samples) %>% 
  ggplot(aes(x = PC2, y = RINe)) +
  geom_point(aes( colour = Genotype, shape = Sex), 
             size = 6) +
  geom_smooth(method = "lm") +
  theme_bw(), 
common.legend = T
)

Initial Differential expression

design matrix

design <- model.matrix(~Genotype, data = x$samples) %>% 
  set_colnames(str_remove(colnames(.), pattern = "Genotype"))

design %>% 
  as.data.frame() %>% 
  pheatmap(cluster_rows = FALSE, cluster_cols = FALSE, 
           color = brewer_pal(palette = "Set1")(2), 
           cellheight = 15, cellwidth = 15)
Visualisation of design matrix

Visualisation of design matrix

fit_1 <- x %>% 
  estimateDisp(design) %>% 
  glmFit(design)

toptable_1 <- design %>% colnames() %>% .[2:3] %>% 
  sapply(function(x){
    glmLRT(fit_1, coef = x) %>%
      topTags(n = Inf) %>%
      .[["table"]] %>%
      as_tibble() %>%
      arrange(PValue) %>%
       mutate(
        coef = x,
        DE = FDR < 0.05
      ) %>% 
      dplyr::select(
        gene_name, logFC, logCPM, PValue, FDR, DE, everything()  
      )
      
  }, simplify = FALSE)

toptable_1 %>% 
  lapply(function(x) {
    x %>% 
      dplyr::filter(DE == TRUE) 
  }) %>% 
  bind_rows() %>% 
  dplyr::select(coef, gene_name, logFC, logCPM, PValue, FDR, DE) %>% 
  kable(caption = "DE genes due to each mutation relative to WT") %>% 
  kable_styling()
DE genes due to each mutation relative to WT
coef gene_name logFC logCPM PValue FDR DE
EOfAD-like/+ col12a1a 1.8959094 3.8394694 0.00e+00 0.0000000 TRUE
EOfAD-like/+ si:ch211-235i11.5 0.6904772 2.8948813 0.00e+00 0.0000000 TRUE
EOfAD-like/+ ano9b 1.2713444 -1.1769775 0.00e+00 0.0000246 TRUE
EOfAD-like/+ si:ch211-235i11.4 0.3423547 3.5708023 9.00e-07 0.0039792 TRUE
EOfAD-like/+ CR318588.1 3.3926411 2.4513599 1.00e-06 0.0039792 TRUE
EOfAD-like/+ myhb 4.1004025 2.3851026 1.10e-06 0.0039792 TRUE
EOfAD-like/+ si:ch211-235i11.6 0.5029123 1.8847032 1.70e-06 0.0050857 TRUE
EOfAD-like/+ adi1 0.5302238 2.7029554 1.80e-06 0.0050857 TRUE
EOfAD-like/+ CR318588.3 2.0624035 2.7489762 3.00e-06 0.0073193 TRUE
EOfAD-like/+ CABZ01068209.1 -0.9723557 -0.9288210 9.80e-06 0.0216029 TRUE
EOfAD-like/+ mov10b.2 1.3400381 -1.8722367 1.57e-05 0.0306411 TRUE
EOfAD-like/+ znf982 0.3808937 2.2570512 1.67e-05 0.0306411 TRUE
EOfAD-like/+ stac3 1.5755831 -2.4135388 2.66e-05 0.0450699 TRUE
fAI-like/+ psen1 -0.7992024 5.3419313 0.00e+00 0.0000000 TRUE
fAI-like/+ dglucy -0.9591401 5.0136276 0.00e+00 0.0000316 TRUE
fAI-like/+ sh3pxd2ab -2.9861437 1.0043260 0.00e+00 0.0002255 TRUE
fAI-like/+ fam167aa -3.1575730 0.3483399 1.00e-07 0.0008048 TRUE
fAI-like/+ kcnk17 -1.3778914 -1.1065158 6.00e-06 0.0264798 TRUE

Visualisations

MD plot

toptable_1 %>%
  bind_rows() %>% 
  ggplot(aes(x = logCPM, y = logFC,  colour = DE)) +
  geom_point(
    alpha = 0.5
  ) +
  facet_grid(rows = vars(coef)) +
  theme_bw() +
  geom_label_repel(
    aes(label = gene_name), 
    data = .  %>% dplyr::filter(FDR < 0.05), 
    show.legend = FALSE
    ) +
  theme(legend.position = "none") +
  ggtitle("MD Plots of different psen1 mutant comparisons to WT with TMM normalisation") +
  scale_color_manual(values = c("grey50", "red"))

Volcano plots

toptable_1 %>%
  bind_rows() %>% 
  ggplot(aes(y = -log10(PValue), x = logFC, colour = DE)) +
  geom_point(
    alpha = 0.5
  ) +
  facet_grid(rows = vars(coef)) +
  theme_bw() +
  geom_label_repel(
    aes(label = gene_name), 
    data = .  %>% dplyr::filter(FDR < 0.05), 
    show.legend = FALSE
    ) +
  theme(legend.position = "none") +
  ggtitle("Volcano Plots of different psen1 mutant comparisons to WT with TMM normalisation") +
  coord_cartesian(ylim = c(0, 10)) +
  scale_color_manual(values = c("grey50", "red"))

Upset plot of shared DE genes

toptable_1 %>% 
  lapply(function(y) {
    y %>% 
      dplyr::filter(DE == TRUE) %>% 
      .$gene_name
  }) %>% 
  UpSetR::fromList() %>% 
  UpSetR::upset()

check for GC and length bias

A ranking statistic of the sign of the logFC multiplied by the log10 of the p-value was calculated for each gene. I plotted this ranking statistic against the gc content and length of each gene to see if there are any biases observed. If there was, I would use the cqn normalisation method to correct for this. No apparent biases were observed, therefore cqn is not appropriate.

Note that the limits for the y axis (ranking stat) are zoomed in for visualisation purposes.

ggarrange(
toptable_1 %>% 
  bind_rows() %>% 
  mutate(rankstat = sign(logFC)*-log10(PValue)) %>% 
  ggplot(aes(x = length, y = rankstat)) +
  geom_point(
    aes(colour = DE),
    alpha = 0.5
  ) +
  geom_smooth(se = FALSE, method = "gam") +
  facet_grid(rows = vars(coef)) +
  theme_bw() +
  theme(legend.position = "none") +
  scale_color_manual(values = c("grey50", "red")) +
  scale_x_log10()+
  labs(x = "Average transcript length per gene",
       colour = "Differentially expressed?",
       y = "sign(logFC)*-log10(PValue)") +
  coord_cartesian(ylim = c(-10, 10)),

toptable_1 %>% 
  bind_rows() %>% 
  mutate(rankstat = sign(logFC)*-log10(PValue)) %>% 
  ggplot(aes(x = gc_content, y = rankstat)) +
  geom_point(
    aes(colour = DE),
    alpha = 0.5
  ) +
  geom_smooth(se = FALSE, method = "gam") +
  facet_grid(rows = vars(coef)) +
  theme_bw() +
  theme(legend.position = "none") +
  scale_color_manual(values = c("grey50", "red")) +
  coord_cartesian(ylim = c(-10,10)) + 
  labs(x = "Weighted average GC content (%) per gene", 
       colour = "Differentially expressed?", 
       y = "sign(logFC)*-log10(PValue)"),

common.legend = TRUE, 
labels = "AUTO"
) 

#  ggsave("plots/gclength.png", width = 18, height = 10, units = "cm", dpi = 600)

Enrichment testing

Gene sets

# GO terms
goSummaries <- url("https://uofabioinformaticshub.github.io/summaries2GO/data/goSummaries.RDS") %>%
  readRDS() %>%
  mutate(
    Term = Term(id),
    gs_name = Term %>% str_to_upper() %>% str_replace_all("[ -]", "_"),
    gs_name = paste0("GO_", gs_name)
    )
minPath <- 3

# get a mapping file of entrez to ensembl ids
ens2Entrez <- grGenes %>% 
    as_tibble() %>% 
    dplyr::select(gene_id, entrezid) %>% 
    unchop(entrezid) %>% 
    dplyr::filter(gene_id %in% rownames(x)) %>% 
    dplyr::filter(!is.na(entrezid)) 

GO <- msigdbr("Danio rerio", category = "C5") %>% 
  dplyr::filter(grepl(gs_name, pattern = "^GO_")) %>% 
  dplyr::rename(entrezid = entrez_gene) %>% 
  inner_join(ens2Entrez) %>%
  dplyr::distinct(gs_name, gene_id, .keep_all = TRUE) %>% 
  split(f = .$gs_name) %>%
  lapply(extract2, "gene_id")

KEGG <- msigdbr("Danio rerio", category = "C2", subcategory = "CP:KEGG") %>% 
  left_join(x$genes, by = c("gene_symbol" = "gene_name")) %>% 
  dplyr::filter(gene_id %in% rownames(x)) %>% 
  distinct(gs_name, gene_id, .keep_all = TRUE) %>% 
  split(f = .$gs_name) %>%
  lapply(extract2, "gene_id")

ireGenes <- readRDS("zebrafishIreGenes.rds") %>% 
  unlist() %>% 
  as.data.frame() %>% 
  rownames_to_column("ire") %>% 
  as_tibble() %>% 
  mutate(ire = str_extract(ire, pattern = "ire[:digit:]_[:alpha:]+")) %>% 
  set_colnames(c("ire", "gene_id")) %>% 
  dplyr::filter(gene_id %in% rownames(x)) %>% 
  split(f = .$ire) %>% 
  lapply(magrittr::extract2,"gene_id") 

chr <- x$genes %>% 
  dplyr::select(gene_id, chromosome) %>% 
  split(f = .$chromosome) %>%
  lapply(extract2, "gene_id") %>% 
  .[c("MT", seq(1:25))] # omit the ALT chromosmes and scaffolds

goseq

To test for enrichment of GO terms within the DE gene, I used goseq. I used the average transcript length per gene to calculate the PWF.

pwfs <- toptable_1 %>%
        lapply(function(x) {
        x %>% 
            as_tibble() %>% 
            distinct(gene_name, .keep_all = TRUE)
        }) %>% 
  lapply(function(x) {
    x %>% 
      with(
        nullp(
          DEgenes = structure(DE, names = gene_id), 
          bias.data = length
        )
      )
  })

goseq(pwfs$`EOfAD-like/+`, gene2cat = c(GO)) %>% 
  as_tibble() %>% 
  mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr")) %>% 
  dplyr::select(-under_represented_pvalue) %>% 
  head(10) %>% 
  kable(caption = "Top 10 overrep GO terms in EOfAD-like/+ brains") %>% 
  kable_styling()
Top 10 overrep GO terms in EOfAD-like/+ brains
category over_represented_pvalue numDEInCat numInCat FDR
GO_SKELETAL_MUSCLE_CONTRACTION 0.0000383 2 32 0.3924105
GO_MULTICELLULAR_ORGANISMAL_MOVEMENT 0.0000774 2 44 0.3963241
GO_STRIATED_MUSCLE_CONTRACTION 0.0008309 2 147 1.0000000
GO_L_METHIONINE_SALVAGE_FROM_METHYLTHIOADENOSINE 0.0014209 1 5 1.0000000
GO_CALCIUM_ACTIVATED_PHOSPHOLIPID_SCRAMBLING 0.0014353 1 4 1.0000000
GO_AMINO_ACID_SALVAGE 0.0017122 1 6 1.0000000
GO_REGULATION_OF_INORGANIC_ANION_TRANSMEMBRANE_TRANSPORT 0.0021386 1 6 1.0000000
GO_MYOSIN_LIGHT_CHAIN_BINDING 0.0022972 1 7 1.0000000
GO_PHOSPHOLIPID_SCRAMBLASE_ACTIVITY 0.0023459 1 7 1.0000000
GO_REGULATION_OF_ANION_CHANNEL_ACTIVITY 0.0023529 1 7 1.0000000
goseq(pwfs$`fAI-like/+`, gene2cat = c(GO)) %>% 
  as_tibble() %>%
  mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr")) %>% 
  dplyr::select(-under_represented_pvalue) %>% 
  head(10) %>% 
  kable(caption = "Top 10 overrep GO terms in fAI-like/+ brains") %>% 
  kable_styling()
Top 10 overrep GO terms in fAI-like/+ brains
category over_represented_pvalue numDEInCat numInCat FDR
GO_GAMMA_SECRETASE_COMPLEX 0.0009466 1 4 1
GO_NOTCH_RECEPTOR_PROCESSING_LIGAND_DEPENDENT 0.0011819 1 5 1
GO_ASPARTIC_ENDOPEPTIDASE_ACTIVITY_INTRAMEMBRANE_CLEAVING 0.0011832 1 5 1
GO_CATION_CHANNEL_ACTIVITY 0.0013874 2 276 1
GO_REGULATION_OF_L_GLUTAMATE_IMPORT_ACROSS_PLASMA_MEMBRANE 0.0014149 1 6 1
GO_CHOLINE_TRANSPORT 0.0014174 1 6 1
GO_REGULATION_OF_AMYLOID_FIBRIL_FORMATION 0.0016558 1 7 1
GO_MODULATION_OF_AGE_RELATED_BEHAVIORAL_DECLINE 0.0016559 1 7 1
GO_REGULATION_OF_RESTING_MEMBRANE_POTENTIAL 0.0018891 1 8 1
GO_REGULATION_OF_CORE_PROMOTER_BINDING 0.0018891 1 8 1

ranked lists

Since no GO terms were found to be overepresented in the DE lists, I used the ranked list approach I have used previously.

fryRes1 <- design %>% colnames() %>% .[2:3] %>% 
  sapply(function(y) {
    cpm(x$counts, log = TRUE) %>% 
      fry(
        index = c(KEGG, ireGenes),
        design = design, 
        contrast = y, 
        sort = "mixed"
      ) %>% 
      rownames_to_column("pathway") %>% 
      as_tibble() %>% 
      mutate(coef = y)
  }, simplify = FALSE)

cameraRes1 <- design %>% colnames() %>% .[2:3] %>% 
  sapply(function(y) {
    x %>% 
      cpm(log = TRUE) %>% 
      camera(
        index = c(KEGG, ireGenes),
        design = design, 
        contrast = y, 
        inter.gene.cor = NA, 
        sort = TRUE
      ) %>% 
      rownames_to_column("pathway") %>% 
      as_tibble() %>% 
      mutate(coef = y, 
             sig = FDR < 0.05)
  }, simplify = FALSE)

ranks_fgsea1 <- 
   sapply(toptable_1, function(x) {
     x %>% 
       mutate(PValue_withsign = sign(logFC) * log10(1/PValue)) %>% 
       arrange(PValue_withsign) %>% 
       dplyr::select(c("gene_id", "PValue_withsign")) %>% #only want the Pvalue with sign
       with(structure(PValue_withsign, names = gene_id)) %>% 
       rev() # reverse so the start of the list is upregulated genes
   }, simplify = FALSE)

set.seed(2)
fgseaRes1 <- ranks_fgsea1 %>%
  lapply(function(x){
    fgseaMultilevel(stats = x, pathways = c(KEGG, ireGenes)) %>%
      as_tibble() %>%
      dplyr::rename(FDR = padj) %>%
      mutate(padj = p.adjust(pval, "bonferroni")) %>%
      dplyr::select(pathway, pval, FDR, padj, everything()) %>%
      arrange(pval) %>%
      mutate(sig = padj < 0.05)
  })

fgseaRes1$`EOfAD-like/+` %<>% 
  mutate(coef = "EOfAD-like/+" )

fgseaRes1$`fAI-like/+` %<>% 
  mutate(coef = 'fAI-like/+')

HMP_1 <- fryRes1 %>% 
  bind_rows() %>% 
  dplyr::select(pathway, PValue.Mixed, coef) %>% 
  dplyr::rename(fry_p = PValue.Mixed) %>% 
  left_join(cameraRes1 %>% 
              bind_rows() %>% 
              dplyr::select(pathway, PValue, coef), 
            by = c("pathway", "coef")) %>% 
  dplyr::rename(camera_p = PValue) %>% 
  left_join(fgseaRes1 %>% 
              bind_rows() %>% 
              dplyr::select(pathway, pval, coef), 
            by = c("pathway", "coef")) %>% 
  dplyr::rename(fgsea_p = pval) %>% 
  bind_rows() %>% 
  nest(p = one_of(c("fry_p", "camera_p", "fgsea_p"))) %>% 
 mutate(harmonic_p = vapply(p, function(x){
        x <- unlist(x)
        x <- x[!is.na(x)]
        p.hmp(x, L = 4)
      }, numeric(1))
    ) %>% 
  unnest() %>% 
  mutate(harmonic_p_FDR = p.adjust(harmonic_p, "fdr"),
         sig = harmonic_p_FDR < 0.05) %>% 
  arrange(harmonic_p)

Enrichment vis

heatmap

sigPaths <- 
  HMP_1 %>% 
  dplyr::filter(harmonic_p_FDR < 0.05) %>% 
  dplyr::filter(grepl(pathway, pattern = "KEGG")) %>% 
  .$pathway %>% 
  unique

sig_in <- 
  HMP_1 %>% 
  dplyr::select(pathway, sig, coef) %>% 
  spread(key = 'coef', value = 'sig') %>% 
  mutate(sig_in = case_when(
    `EOfAD-like/+` == "TRUE" & `fAI-like/+` == "TRUE" ~ 'both', 
    `EOfAD-like/+` == FALSE & `fAI-like/+` == "TRUE" ~ 'fHS only', 
    `EOfAD-like/+` == "TRUE" & `fAI-like/+` == FALSE ~ 'EOfAD only', 
    TRUE ~ 'neither'
  )) %>% 
  dplyr::select(pathway, sig_in)

HMP_1 %>% 
  dplyr::filter(pathway %in% sigPaths) %>% 
  left_join(sig_in) %>% 
  ggplot(aes(y = coef, x = pathway)) +
  geom_tile(aes(fill = -log10(harmonic_p)), colour = 'black'
              ) +
  geom_text(aes(label = signif(harmonic_p_FDR, digits = 2)), 
            colour = "white") +
  geom_text(aes(label = signif(harmonic_p_FDR, digits = 2)), 
            data = . %>% 
              dplyr::filter(pathway == "KEGG_NOTCH_SIGNALING_PATHWAY" &
                              coef == "fAI-like/+"), 
            colour = "black") +
  theme_bw() +
  easy_rotate_x_labels(angle = -90) +
  theme(panel.grid = element_blank(), 
        panel.border = element_blank(), 
        strip.background = element_blank(), 
        strip.text = element_blank()) + 
  labs(x = "", y = "") +
  facet_grid(cols = vars(sig_in), scales = 'free', shrink = FALSE, space = 'free') + 
  scale_fill_viridis_c(option = 'viridis')

upset plotsof leading edge genes

# upset leading edeg EOfAD    
fgseaRes1$`EOfAD-like/+` %>% 
  dplyr::filter(pathway %in%c(HMP_1 %>% 
                                dplyr::filter(coef == "EOfAD-like/+" & sig == TRUE) %>%
                                .$pathway)
                ) %>% 
  dplyr::select(pathway, leadingEdge) %>% 
  unnest(cols = c(leadingEdge)) %>% 
  split(f = .$pathway) %>% 
  lapply(magrittr::extract2, "leadingEdge") %>% 
  UpSetR::fromList() %>% 
  UpSetR::upset(order.by = "freq", nsets = length(.))

# upset leading edeg fHS
fgseaRes1$`fAI-like/+` %>% 
  dplyr::filter(pathway %in%c(HMP_1 %>% 
                                dplyr::filter(coef == "fAI-like/+" & sig == TRUE) %>%
                                .$pathway)
                ) %>% 
  # shorten this name for vis purposes
  mutate(pathway = str_replace(pathway, pattern = "KEGG_ARRHYTHMOGENIC_RIGHT_VENTRICULAR_CARDIOMYOPATHY_ARVC", 
                               replacement = "KEGG_ARRHYTHMOGENIC_RIGHT_VENT...")) %>% 
  dplyr::filter(grepl(pathway, pattern = "KEGG")) %>% 
  dplyr::select(pathway, leadingEdge) %>% 
  unnest(cols = c(leadingEdge)) %>% 
  split(f = .$pathway) %>% 
  lapply(magrittr::extract2, "leadingEdge") %>% 
  UpSetR::fromList() %>% 
  UpSetR::upset(order.by = "freq", nsets = length(.) )

heatmaps of leading edges

#png("plots/cyto_leadingEdgePheatmap.png", width = 35, height = 5, units = "cm", res = 800)
fgseaRes1 %>% 
  bind_rows() %>% 
  dplyr::filter(pathway == "KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION") %>% 
  dplyr::select(leadingEdge, coef) %>% 
  set_colnames(c("gene_id", "coef")) %>% 
  unnest(cols = c(gene_id)) %>% 
  left_join(toptable_1 %>% bind_rows(), by = c("coef", "gene_id")) %>% 
  dplyr::select(gene_name, coef, logFC) %>% 
  dplyr::distinct(gene_name, coef, .keep_all = TRUE) %>% 
  spread(key = "coef", value = "logFC") %>% 
  column_to_rownames("gene_name") %>% 
  pheatmap(cluster_cols = F,cluster_rows = F, 
           na_col = "grey80",
           cellwidth = 12, cellheight = 12,
           color = colorRampPalette(brewer.pal(n = 7, name = "YlOrRd"))(100), 
  )

# dev.off()
# 
# png("plots/jakstat_leadingEdgePheatmap.png", width = 35, height = 5, units = "cm", res = 800)
fgseaRes1 %>% 
  bind_rows() %>% 
  dplyr::filter(pathway == "KEGG_JAK_STAT_SIGNALING_PATHWAY") %>% 
  dplyr::select(leadingEdge, coef) %>% 
  set_colnames(c("gene_id", "coef")) %>% 
  unnest(cols = c(gene_id)) %>% 
  left_join(toptable_1 %>% bind_rows(), by = c("coef", "gene_id")) %>% 
  dplyr::select(gene_name, coef, logFC) %>% 
  dplyr::distinct(gene_name, coef, .keep_all = TRUE) %>% 
  spread(key = "coef", value = "logFC") %>% 
  column_to_rownames("gene_name") %>% 
  pheatmap(cluster_cols = F,cluster_rows = F, 
           na_col = "grey80",
           cellwidth = 12, cellheight = 12,
           color = colorRampPalette(brewer.pal(n = 7, name = "YlOrRd"))(100), 
  )

#dev.off()

#png("plots/ribo_leadingEdgePheatmap.png", width = 35, height = 5, units = "cm", res = 800)
fgseaRes1 %>% 
  bind_rows() %>% 
  dplyr::filter(pathway == "KEGG_RIBOSOME") %>% 
  dplyr::select(leadingEdge, coef) %>% 
  set_colnames(c("gene_id", "coef")) %>% 
  unnest(cols = c(gene_id)) %>% 
  left_join(toptable_1 %>% bind_rows(), by = c("coef", "gene_id")) %>% 
  dplyr::select(gene_name, coef, logFC) %>% 
  dplyr::distinct(gene_name, coef, .keep_all = TRUE) %>% 
  spread(key = "coef", value = "logFC") %>% 
  column_to_rownames("gene_name") %>% 
  pheatmap(cluster_cols = F,cluster_rows = F, 
           na_col = "grey80",
           cellwidth = 12, cellheight = 12,
           color = colorRampPalette(rev(brewer.pal(n = 7, name = "YlGnBu")))(100), 
  )

#dev.off()
toptable_1 %>% 
  bind_rows() %>% 
  dplyr::filter(gene_id %in% KEGG$KEGG_RIBOSOME) %>% 
  dplyr::select(gene_id, coef, logFC) %>% 
  spread(key = "coef", value = "logFC") %>% 
  column_to_rownames("gene_id") %>% 
  pheatmap(colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100), 
           show_rownames = FALSE, 
           treeheight_row = 0, treeheight_col = 0, 
           breaks = c(seq(min(.), 0, length.out=ceiling(100/2) + 1), 
              seq(max(.)/100, max(.), length.out=floor(100/2)))
           )

# toptable_Q96 %>% 
#   dplyr::filter(ensembl_gene_id %in% KEGG$KEGG_RIBOSOME) %>% 
#   dplyr::select(ensembl_gene_id, logFC) %>% 
#   column_to_rownames("ensembl_gene_id") %>% 
#   pheatmap(colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100), 
#            show_rownames = FALSE, cluster_cols = F,
#            treeheight_row = 0, treeheight_col = 0, 
#            breaks = c(seq(min(.), 0, length.out=ceiling(100/2) + 1), 
#               seq(max(.)/100, max(.), length.out=floor(100/2)))
#            )
annocol_2 <- KEGG$KEGG_PATHWAYS_IN_CANCER %>% 
  as.data.frame() %>% 
  set_colnames("gene_id") %>% 
  mutate(`EOfAD-like/+` = case_when(
    gene_id %in% c(fgseaRes1$`EOfAD-like/+` %>% 
                     dplyr::filter(pathway == "KEGG_PATHWAYS_IN_CANCER") %>% 
                     .$leadingEdge %>% unlist) ~ "TRUE"),
    `fAI-like/+` = case_when(
      gene_id %in% c(fgseaRes1$`fAI-like/+` %>% dplyr::filter(pathway == "KEGG_PATHWAYS_IN_CANCER") %>% .$leadingEdge %>% unlist) ~ "TRUE"
    ) 
    )%>% 
  replace_na(list(`EOfAD-like/+` = FALSE, `fAI-like/+` = FALSE)) %>% 
  inner_join(x$genes) %>% 
  dplyr::distinct(gene_name, .keep_all = TRUE) %>% 
  dplyr::select(gene_name, `EOfAD-like/+`, `fAI-like/+`) %>% 
  column_to_rownames("gene_name") 

#png("plots/KEGG_PATHWAYS_IN_CANCER.png", width = 35, height = 15, units = "cm", res = 600)
toptable_1 %>% 
  bind_rows() %>% 
  dplyr::filter(gene_id %in% KEGG$KEGG_PATHWAYS_IN_CANCER) %>% 
  dplyr::select(gene_name, coef, logFC) %>% 
  dplyr::distinct(gene_name, coef, .keep_all = TRUE) %>% 
  spread(key = "coef", value = "logFC") %>% 
  column_to_rownames("gene_name") %>% 
  t() %>% 
  pheatmap(
    #cellwidth = 10, 
    cellheight = 10, 
    show_colnames = F,
    treeheight_row = 0, treeheight_col = 80,
    breaks = seq(-max(abs(.)),  max(abs(.)), length.out = 100), 
    color = colorRampPalette(rev(brewer.pal(n = 7, 
                                            name = "RdBu")))(100), 
    main = "KEGG_PATHWAYS_IN_CANCER" , 
    annotation_col = annocol_2, 
    width = 35,
    annotation_colors = list(
      `EOfAD-like/+` = c("FALSE" = "grey70", "TRUE" = "#6bff97"), 
      `fAI-like/+` = c("FALSE" = "grey70", "TRUE" = "#6bff97"))
  )

#dev.off()  

Does the downregulation of psen1 due to the FS mutation drive the enrichment of certain gene sets?

fryRes_nopsen1 <- design %>% colnames() %>% .[2:3] %>% 
  sapply(function(y) {
    cpm(x$counts, log = TRUE)[!grepl(rownames(cpm(x$counts)), pattern = "ENSDARG00000004870"),] %>%     fry(
      index = c(KEGG, ireGenes),
        design = design, 
        contrast = y, 
        sort = "mixed"
      ) %>% 
      rownames_to_column("pathway") %>% 
      as_tibble() %>% 
      mutate(coef = y)
  }, simplify = FALSE)

cameraRes_nopsen1 <- design %>% colnames() %>% .[2:3] %>% 
  sapply(function(y) {
    cpm(x$counts, log = TRUE)[!grepl(rownames(cpm(x$counts)), pattern = "ENSDARG00000004870"),] %>%
      camera(
        index = c(KEGG, ireGenes),
        design = design, 
        contrast = y, 
        inter.gene.cor = NA, 
        sort = TRUE
      ) %>% 
      rownames_to_column("pathway") %>% 
      as_tibble() %>% 
      mutate(coef = y, 
             sig = FDR < 0.05)
  }, simplify = FALSE)

ranks_fgsea_nopsen1 <- 
   sapply(toptable_1, function(x) {
     x %>% 
       dplyr::filter(gene_id != "ENSDARG00000004870") %>% 
       mutate(PValue_withsign = sign(logFC) * log10(1/PValue)) %>% 
       arrange(PValue_withsign) %>% 
       dplyr::select(c("gene_id", "PValue_withsign")) %>% #only want the Pvalue with sign
       with(structure(PValue_withsign, names = gene_id)) %>% 
       rev() # reverse so the start of the list is upregulated genes
   }, simplify = FALSE)

set.seed(1)
fgseaRes_nopsen1 <- ranks_fgsea_nopsen1 %>%
  lapply(function(y){
    fgseaMultilevel(stats = y, pathways = c(KEGG, ireGenes), eps = 0) %>%
      as_tibble() %>%
      dplyr::rename(FDR = padj) %>%
      mutate(padj = p.adjust(pval, "bonferroni")) %>%
      dplyr::select(pathway, pval, FDR, padj, everything()) %>%
      arrange(pval) %>%
      mutate(sig = padj < 0.05)
  })

fgseaRes_nopsen1$`EOfAD-like/+` %<>% 
  mutate(coef = "EOfAD-like/+" )

fgseaRes_nopsen1$`fAI-like/+` %<>% 
  mutate(coef = 'fAI-like/+')

HMP_nopsen1 <- fryRes_nopsen1 %>% 
  bind_rows() %>% 
  dplyr::select(pathway, PValue.Mixed, coef) %>% 
  dplyr::rename(fry_p = PValue.Mixed) %>% 
  left_join(cameraRes_nopsen1 %>% 
              bind_rows() %>% 
              dplyr::select(pathway, PValue, coef), 
            by = c("pathway", "coef")) %>% 
  dplyr::rename(camera_p = PValue) %>% 
  left_join(fgseaRes_nopsen1 %>% 
              bind_rows() %>% 
              dplyr::select(pathway, pval, coef), 
            by = c("pathway", "coef")) %>% 
  dplyr::rename(fgsea_p = pval) %>% 
  bind_rows() %>% 
  nest(p = one_of(c("fry_p", "camera_p", "fgsea_p"))) %>% 
 mutate(harmonic_p = vapply(p, function(x){
        x <- unlist(x)
        x <- x[!is.na(x)]
        p.hmp(x, L = 4)
      }, numeric(1))
    ) %>% 
  unnest() %>% 
  mutate(harmonic_p_FDR = p.adjust(harmonic_p, "fdr"),
         sig = harmonic_p_FDR < 0.05) %>% 
  arrange(harmonic_p) 

HMP_nopsen1 %>% 
  dplyr::filter(harmonic_p_FDR < 0.05) %>% 
  ggplot(aes(x = coef, y = pathway)) +
  geom_point(size = 4) +
  geom_label(aes(label = signif(harmonic_p_FDR, digits = 2)))

analysis of notch targets

notchTargets <-
  msigdbr("Danio rerio", category = "C6") %>%
  dplyr::filter(gs_name %in% c("NOTCH_DN.V1_UP", "NOTCH_DN.V1_DN","NGUYEN_NOTCH1_TARGETS_UP", "NGUYEN_NOTCH1_TARGETS_DN")) %>%
  rbind(msigdbr("Danio rerio", category = "C2") %>%
          dplyr::filter(gs_name %in% c("RYAN_MANTLE_CELL_LYMPHOMA_NOTCH_DIRECT_UP"))
  ) %>%
  rbind(msigdbr("Danio rerio", category = "C2") %>%
          dplyr::filter(gs_name %in% c("NGUYEN_NOTCH1_TARGETS_UP", "NGUYEN_NOTCH1_TARGETS_DN"))
        ) %>%
  left_join(x$genes, by = c("gene_symbol" = "gene_name")) %>%
  dplyr::filter(!is.na(gene_id)) %>%
  distinct(gs_name, gene_id, .keep_all = TRUE) %>%
  split(f = .$gs_name) %>%
  lapply(extract2, "gene_id")


fryRes_notch <- design %>% colnames() %>% .[2:3] %>% 
  sapply(function(y) {
    cpm(x$counts, log = TRUE) %>% 
      fry(
        index = notchTargets,
        design = design, 
        contrast = y, 
        sort = "mixed"
      ) %>% 
      rownames_to_column("pathway") %>% 
      as_tibble() %>% 
      mutate(coef = y)
  }, simplify = FALSE)

cameraRes_notch <- design %>% colnames() %>% .[2:3] %>% 
  sapply(function(y) {
    x %>% 
      cpm(log = TRUE) %>% 
      camera(
        index = notchTargets,
        design = design, 
        contrast = y, 
        inter.gene.cor = NA, 
        sort = TRUE
      ) %>% 
      rownames_to_column("pathway") %>% 
      as_tibble() %>% 
      mutate(coef = y, 
             #sig = FDR < 0.05
             )
  }, simplify = FALSE)


set.seed(33)
fgseaRes_notch <- ranks_fgsea1 %>%
  lapply(function(x){
    fgseaMultilevel(stats = x, pathways = notchTargets) %>%
      as_tibble() %>%
      dplyr::rename(FDR = padj) %>%
      mutate(padj = p.adjust(pval, "bonferroni")) %>%
      dplyr::select(pathway, pval, FDR, padj, everything()) %>%
      arrange(pval) %>%
      mutate(sig = padj < 0.05)
  })

fgseaRes_notch$`EOfAD-like/+` %<>% 
  mutate(coef = "EOfAD-like/+" )

fgseaRes_notch$`fAI-like/+` %<>% 
  mutate(coef = 'fAI-like/+')

HMP_notch <- fryRes_notch %>% 
  bind_rows() %>% 
  dplyr::select(pathway, PValue, coef) %>% 
  dplyr::rename(fry_p = PValue) %>% 
  left_join(cameraRes_notch %>% 
              bind_rows() %>% 
              dplyr::select(pathway, PValue, coef), 
            by = c("pathway", "coef")) %>% 
  dplyr::rename(camera_p = PValue) %>% 
  left_join(fgseaRes_notch %>% 
              bind_rows() %>% 
              dplyr::select(pathway, pval, coef), 
            by = c("pathway", "coef")) %>% 
  dplyr::rename(fgsea_p = pval) %>% 
  bind_rows() %>% 
  nest(p = one_of(c("fry_p", "camera_p", "fgsea_p"))) %>% 
 mutate(harmonic_p = vapply(p, function(x){
        x <- unlist(x)
        x <- x[!is.na(x)]
        p.hmp(x, L = 4)
      }, numeric(1))
    ) %>% 
  unnest() %>% 
  mutate(harmonic_p_FDR = p.adjust(harmonic_p, "fdr"),
         sig = harmonic_p_FDR < 0.05) %>% 
  arrange(harmonic_p) 

HMP_notch
## # A tibble: 10 x 8
##    pathway       coef     fry_p camera_p fgsea_p harmonic_p harmonic_p_FDR sig  
##    <chr>         <chr>    <dbl>    <dbl>   <dbl>      <dbl>          <dbl> <lgl>
##  1 RYAN_MANTLE_… EOfAD… 0.00968  0.00810 2.23e-4   0.000642        0.00642 TRUE 
##  2 RYAN_MANTLE_… fAI-l… 0.0304   0.0491  5.10e-4   0.00151         0.00756 TRUE 
##  3 NOTCH_DN.V1_… EOfAD… 0.0659   0.208   4.80e-2   0.105           0.349   FALSE
##  4 NGUYEN_NOTCH… fAI-l… 0.148    0.487   6.23e-1   0.435           0.612   FALSE
##  5 NOTCH_DN.V1_… EOfAD… 0.163    0.942   5.38e-1   0.475           0.612   FALSE
##  6 NOTCH_DN.V1_… fAI-l… 0.166    0.712   8.10e-1   0.488           0.612   FALSE
##  7 NOTCH_DN.V1_… fAI-l… 0.228    0.655   6.60e-1   0.528           0.612   FALSE
##  8 NGUYEN_NOTCH… EOfAD… 0.297    0.660   9.16e-1   0.578           0.612   FALSE
##  9 NGUYEN_NOTCH… fAI-l… 0.321    0.788   7.27e-1   0.585           0.612   FALSE
## 10 NGUYEN_NOTCH… EOfAD… 0.407    0.860   7.53e-1   0.612           0.612   FALSE
# 
# annocol <- notchTargets$RYAN_MANTLE_CELL_LYMPHOMA_NOTCH_DIRECT_UP %>% 
#   as.data.frame() %>% 
#   set_colnames("gene_id") %>% 
#   mutate(`EOfAD-like/+` = case_when(
#     gene_id %in% fgseaRes_notch$`EOfAD-like/+`$leadingEdge[[1]] ~ "TRUE"),
#     `fAI-like/+` = case_when(
#       gene_id %in% fgseaRes_notch$`fAI-like/+`$leadingEdge[[1]] ~ "TRUE"
#     )  
#   ) %>% 
#   replace_na(list(`EOfAD-like/+` = FALSE, `fAI-like/+` = FALSE)) %>% 
#   inner_join(x$genes) %>% 
#   dplyr::distinct(gene_name, .keep_all = TRUE) %>% 
#   dplyr::select(gene_name, `EOfAD-like/+`, `fAI-like/+`) %>% 
#   column_to_rownames("gene_name") 
# 
# png("plots/ryanetalPheatmap.png", width = 60, height = 20, units = "cm", res = 600)
# toptable_1 %>% 
#   bind_rows() %>% 
#   dplyr::filter(gene_id %in% notchTargets$RYAN_MANTLE_CELL_LYMPHOMA_NOTCH_DIRECT_UP) %>% 
#   dplyr::select(gene_name, coef, logFC) %>% 
#   dplyr::distinct(gene_name, coef, .keep_all = TRUE) %>% 
#   spread(key = "coef", value = "logFC") %>% 
#   column_to_rownames("gene_name") %>% 
#   t() %>% 
#   pheatmap(cellwidth = 10, cellheight = 10,
#            treeheight_row = 0, treeheight_col = 80,
#            breaks = seq(-max(abs(.)),  max(abs(.)), length.out = 100), 
#            color = colorRampPalette(rev(brewer.pal(n = 7, 
#                                                    name = "RdBu")))(100), 
#            main = "RYAN_MANTLE_CELL_LYMPHOMA_NOTCH_DIRECT_UP" , 
#            annotation_col = annocol, 
#            annotation_colors = list(
#              `EOfAD-like/+` = c("FALSE" = "grey70", "TRUE" = "#6bff97"), 
#              `fAI-like/+` = c("FALSE" = "grey70", "TRUE" = "#6bff97"))
#            )
# dev.off()  
#   
# fgseaRes_notch %>% 
#   lapply(function(y) {
#     y %>% 
#       dplyr::filter(pathway == "RYAN_MANTLE_CELL_LYMPHOMA_NOTCH_DIRECT_UP") %>% 
#       dplyr::select(leadingEdge)
#   }) %>% 
#   unlist %>% 
#   as.data.frame() %>% 
#   rownames_to_column("coef") %>% 
#   set_colnames(c("coef", "gene_id")) %>% 
#   mutate(coef = str_remove(coef, pattern = ".leadingEdge[0-9]+")) %>% 
#   split(f = .$coef) %>% 
#   lapply(magrittr::extract2, "gene_id") %>% 
#   UpSetR::fromList() %>% 
#   UpSetR::upset(order.by = "freq")
# 
# toptable_1 %>% 
#   bind_rows() %>% 
#   dplyr::filter(gene_id %in% intersect(fgseaRes_notch$`EOfAD-like/+`[1,]$leadingEdge[[1]], 
#                                        fgseaRes_notch$`fHS-like/+`[1,]$leadingEdge[[1]])) %>% 
#   dplyr::select(gene_name, coef, logFC) %>% 
#   dplyr::distinct(gene_name, coef, .keep_all = TRUE) %>% 
#   spread(key = "coef", value = "logFC") %>% 
#   column_to_rownames("gene_name") %>% 
#   pheatmap(cluster_cols = FALSE, clustering_method = 'average',
#            breaks = seq(-max(abs(.)),  max(abs(.)), length.out = 100), 
#            color = colorRampPalette(rev(brewer.pal(n = 7, 
#                                                    name = "RdBu")))(100), 
#            main = "Ryan et al. leading edge shared genes")
# 
# 
# toptable_1 %>% 
#   bind_rows() %>% 
#   dplyr::filter(gene_id %in% fgseaRes_notch$`EOfAD-like/+`[1,]$leadingEdge[[1]])%>% 
#   dplyr::select(gene_name, coef, logFC) %>% 
#   dplyr::distinct(gene_name, coef, .keep_all = TRUE) %>% 
#   spread(key = "coef", value = "logFC") %>% 
#   column_to_rownames("gene_name") %>% 
#   pheatmap(cluster_cols = FALSE, clustering_method = 'average',
#            breaks = seq(-max(abs(.)),  max(abs(.)), length.out = 100), 
#            color = colorRampPalette(rev(brewer.pal(n = 7, 
#                                                    name = "RdBu")))(100), 
#            main = "Ryan et al. leading edge shared genes")
#                   
# toptable_1 %>% 
#   bind_rows() %>% 
#   dplyr::filter(gene_id %in% fgseaRes_notch$`fHS-like/+`[1,]$leadingEdge[[1]])%>% 
#   dplyr::select(gene_name, coef, logFC) %>% 
#   dplyr::distinct(gene_name, coef, .keep_all = TRUE) %>% 
#   spread(key = "coef", value = "logFC") %>% 
#   column_to_rownames("gene_name") %>% 
#   pheatmap(cluster_cols = FALSE, clustering_method = 'average',
#            breaks = seq(-max(abs(.)),  max(abs(.)), length.out = 100), 
#            color = colorRampPalette(rev(brewer.pal(n = 7, 
#                                                    name = "RdBu")))(100), 
#            main = "Ryan et al. leading edge shared genes")

design %>% colnames() %>% .[2:3] %>% 
  sapply(function(y) {
    cpm(x$counts, log = TRUE) %>% 
      mroast(
        index = notchTargets,
        design = design, 
        contrast = y, 
        sort = "mixed"
      ) %>% 
      rownames_to_column("pathway") %>% 
      as_tibble() %>% 
      mutate(coef = y)
  }, simplify = FALSE) %>% 
  bind_rows() %>% 
  mutate(Upregulated = NGenes*PropUp, 
         Downregulated = NGenes*PropDown, 
         ns = NGenes - Upregulated - Downregulated) %>% 
  left_join(HMP_notch) %>% 
  dplyr::select(pathway, Upregulated, Downregulated, ns, coef, harmonic_p, sig) %>% 
  gather(key = "Direction", value = "NGenes", c("Upregulated", "Downregulated", "ns")) %>% 
  mutate(Direction = factor(Direction, levels = c("ns", "Downregulated", "Upregulated"))) %>% 
  ggplot(aes(x = pathway)) + 
  geom_col(aes(y = NGenes, fill = Direction, alpha = sig)) +
  # geom_text(aes(label = signif(harmonic_p, 2), y = NGenes-10), 
  #            data = . %>% 
  #              dplyr::distinct(coef, pathway, .keep_all = TRUE)
  # ) +
  coord_flip() +
  scale_alpha_manual(values = c(0.5, 1)) +
  scale_fill_manual(values = c("grey50", "#2166AC", "#B2182B")) +
  scale_color_manual(values = c("NA", "black")) +
  theme(legend.position = "bottom") +
  facet_wrap(~coef) +
  labs(y = "Number of genes", 
       x = "Notch gene set", 
       fill = "Direction" , 
       alpha = "FDR-adjusted\nHMP< 0.05") +
  theme_bw() 

  #ggsave("plots/notchTargets.png", width = 20, height = 10, units = "cm", dpi = 600, scale = 1.3)

plot psen2 expression

psen2_gene_id <- "ENSDARG00000015540"

cpm(x, log = TRUE) %>% 
  as.data.frame() %>% 
  .[psen2_gene_id,] %>% 
  gather(key = "sample", value = "logCPM") %>% 
  left_join(x$samples) %>% 
  ggplot(aes(x = Genotype, y = logCPM, fill = Genotype)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(aes(shape = Sex), 
              size = 3,  colour = "black") +
  theme_bw() +
  theme(aspect.ratio = 1) +
  easy_rotate_x_labels(angle = -45) +
  labs(x = "Genotype", 
       fill = "Genotype") +
    stat_compare_means(comparisons = list(c("WT", "EOfAD-like/+"),
                                          c("WT", "fAI-like/+")))

  #ggsave("plots/psen2expression.png", width = 10, height = 10, units = "cm", dpi = 600, scale = 1.5)

cell type check

# This code loads the function in the working environment
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")

cell_type_markers <- read_xls("gene_sets/cell_type_genes41586_2007_BFnature05453_MOESM11_ESM.xls") %>%
  as_tibble() %>%
 dplyr::select(c("Neuron-enriched", "Astrocyte-enriched", "Oligodendrocyte-enriched")) %>%
  gather(key = "cell_type", value = "gene_name")

# these are in mouse, so want to convert to zebrafish enselbl id.
# went and downloaded this from biomart website
mm2zfID <- read_tsv("gene_sets/mouse_geneName_to_zf_ID.txt") %>%
  as_tibble() %>%
  set_colnames(c("m_geneID", "gene_name", "gene_id"))

cell_type_markers <- left_join(cell_type_markers, mm2zfID) %>%
  na.omit() %>%
  dplyr::select(c("cell_type", "gene_id"))

# microglia
microglia <- read_excel("gene_sets/microglia_oosterofetal.xlsx", sheet = 1) %>%
  rbind(read_excel("gene_sets/microglia_oosterofetal.xlsx", sheet = 2)) %>%
  rbind(read_excel("gene_sets/microglia_oosterofetal.xlsx", sheet = 3)) %>%
  .$Zebrafish.Ensembl.Gene.ID %>%
  unique

ggarrange(
  #Neurons
x %>%
  cpm(log=TRUE) %>%
  as.data.frame() %>%
  rownames_to_column("gene_id") %>%
  dplyr::filter(gene_id %in% c(cell_type_markers %>%
                                 dplyr::filter(cell_type == "Neuron-enriched") %>%
                                 .$gene_id)
  ) %>%
  gather(key = "sample", value = "logCPM", colnames(x)) %>%
  left_join(x$samples) %>%
  ggplot(aes(x = Genotype, y = logCPM)) +
  geom_flat_violin( 
    aes(fill = Genotype),
    position = position_nudge(x = 0.2, y = 0),
    alpha = 0.75,
    trim = FALSE) +
  geom_point(
    aes(colour = Genotype), 
    position = position_jitter(width = 0.15), size = 1, alpha = 0.1 ) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.8) +
  theme_bw() +
  labs(x = "",
       fill = "Genotype") +
  ggtitle("Neurons") +
  coord_flip() +
  stat_compare_means(method =  'anova',label = "p.signif", label.y = 15, label.x = "EOfAD-like/+")
  ,
# Oligodendrocytes

x %>%
  cpm(log=TRUE) %>%
  as.data.frame() %>%
  rownames_to_column("gene_id") %>%
  dplyr::filter(gene_id %in% c(cell_type_markers %>%
                                 dplyr::filter(cell_type == "Oligodendrocyte-enriched") %>%
                                 .$gene_id)
  ) %>%
  gather(key = "sample", value = "logCPM", colnames(x)) %>%
  left_join(x$samples) %>%
  ggplot(aes(x = Genotype, y = logCPM)) +
  geom_flat_violin( 
    aes(fill = Genotype),
    position = position_nudge(x = 0.2, y = 0),
    alpha = 0.75,
    trim = FALSE) +
  geom_point(
    aes(colour = Genotype), 
    position = position_jitter(width = 0.15), size = 1, alpha = 0.1 ) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.8) +
  theme_bw() +
  labs(x = "",
       fill = "Genotype") +
  ggtitle("Oligodendrocytes") +
  stat_compare_means(method =  'anova',label = "p.signif", label.y = 15, label.x = "EOfAD-like/+") +
  coord_flip(),
#Astrocytes
x %>%
  cpm(log=TRUE) %>%
  as.data.frame() %>%
  rownames_to_column("gene_id") %>%
  dplyr::filter(gene_id %in% c(cell_type_markers %>%
                                 dplyr::filter(cell_type == "Astrocyte-enriched") %>%
                                 .$gene_id)
  ) %>%
  gather(key = "sample", value = "logCPM", colnames(x)) %>%
  left_join(x$samples) %>%
  ggplot(aes(x = Genotype, y = logCPM)) +
  geom_flat_violin( 
    aes(fill = Genotype),
    position = position_nudge(x = 0.2, y = 0),
    alpha = 0.75,
    trim = FALSE) +
  geom_point(
    aes(colour = Genotype), 
    position = position_jitter(width = 0.15), size = 1, alpha = 0.25 ) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.8) +
  theme_bw() +
  labs(x = "",
       fill = "Genotype") +
  ggtitle("Astrocytes") +
  coord_flip() +
  stat_compare_means(method =  'anova', label = "p.signif", label.y = 15, label.x = "EOfAD-like/+"),

  # microglia
  x %>%
    cpm(log=TRUE) %>%
    as.data.frame() %>%
    rownames_to_column("gene_id") %>%
    dplyr::filter(gene_id %in% microglia) %>%
  gather(key = "sample", value = "logCPM", colnames(x)) %>%
  left_join(x$samples) %>%
  ggplot(aes(x = Genotype, y = logCPM)) +
  geom_flat_violin( 
    aes(fill = Genotype),
    position = position_nudge(x = 0.2, y = 0),
    alpha = 0.75,
    trim = FALSE) +
  geom_point(
    aes(colour = Genotype), 
    position = position_jitter(width = 0.15), size = 1, alpha = 0.25 ) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.8) +
  theme_bw() +
  labs(x = "",
       fill = "Genotype") +
  coord_flip() +
  stat_compare_means(method =  'anova', label = "p.signif", label.y = 15, label.x = "EOfAD-like/+") +
  ggtitle("Microglia"),
legend = "none",
labels = "AUTO"
) +
  ggsave("plots/cellTypeCheck.tiff", width = 22, height = 18, units = "cm", dpi = 600)

cell_type_markers %<>% split(f = .$cell_type) %>% lapply(extract2, "gene_id") 

cell_type_markers$Microglia <- microglia

design %>% colnames() %>% .[2:3] %>% 
  sapply(function(y) {
    cpm(x$counts, log = TRUE) %>% 
      fry(
        index = cell_type_markers,
        design = design, 
        contrast = y, 
        sort = "mixed"
      ) %>% 
      rownames_to_column("pathway") %>% 
      as_tibble() %>% 
      mutate(coef = y)
  }, simplify = FALSE)
## $`EOfAD-like/+`
## # A tibble: 4 x 8
##   pathway         NGenes Direction PValue    FDR PValue.Mixed FDR.Mixed coef    
##   <chr>            <int> <chr>      <dbl>  <dbl>        <dbl>     <dbl> <chr>   
## 1 Microglia          432 Up        0.0201 0.0802        0.148     0.594 EOfAD-l…
## 2 Astrocyte-enri…     40 Up        0.220  0.440         0.415     0.636 EOfAD-l…
## 3 Oligodendrocyt…     86 Up        0.694  0.694         0.489     0.636 EOfAD-l…
## 4 Neuron-enriched     71 Down      0.666  0.694         0.636     0.636 EOfAD-l…
## 
## $`fAI-like/+`
## # A tibble: 4 x 8
##   pathway           NGenes Direction PValue   FDR PValue.Mixed FDR.Mixed coef   
##   <chr>              <int> <chr>      <dbl> <dbl>        <dbl>     <dbl> <chr>  
## 1 Microglia            432 Up        0.0745 0.252        0.196     0.317 fAI-li…
## 2 Astrocyte-enrich…     40 Up        0.126  0.252        0.221     0.317 fAI-li…
## 3 Neuron-enriched       71 Down      0.799  0.799        0.313     0.317 fAI-li…
## 4 Oligodendrocyte-…     86 Up        0.471  0.628        0.317     0.317 fAI-li…

Compare with Q96_K97del

comp_w_Q96 <- read_rds("~/Documents/EOfAD_signature/R_objects/psen1Q96_HMP.rds") %>% 
  mutate(coef = "Q96_K97del/+") %>% 
  bind_rows(HMP_1 %>% 
              dplyr::select(pathway, coef, harmonic_p, harmonic_p_FDR))

sig_all <- comp_w_Q96 %>% 
  dplyr::filter(harmonic_p_FDR < 0.05) %>% 
  .$pathway %>% 
  unique


sig_in_all <- comp_w_Q96 %>% 
  mutate(sig = harmonic_p_FDR < 0.05) %>% 
  dplyr::select(pathway, sig, coef) %>% 
  spread(key = 'coef', value = 'sig') %>% 
  mutate(sig_in = case_when(
    
    `EOfAD-like/+` == TRUE & `fAI-like/+` == TRUE & `Q96_K97del/+` == TRUE ~ 'all', 
    
    `EOfAD-like/+` == "FALSE" & `fAI-like/+` == TRUE & `Q96_K97del/+` == "FALSE" ~ 'fHS only', 
    
    `EOfAD-like/+` == TRUE & `fAI-like/+` == "FALSE" & `Q96_K97del/+` == "FALSE" ~ "T428del only",
    
    `EOfAD-like/+` == "FALSE" & `fAI-like/+` == "FALSE" & `Q96_K97del/+` == TRUE ~ "Q96 only",
    
    `EOfAD-like/+` == "TRUE" & `fAI-like/+` == "FALSE" & `Q96_K97del/+` == TRUE ~ "b_Q96 and T428 only",
    
    `EOfAD-like/+` == TRUE & `fAI-like/+` == TRUE & `Q96_K97del/+` == "FALSE" ~ "fHS and T428 only",
    TRUE ~ 'neither'
  )) %>% 
  dplyr::select(pathway, sig_in)

comp_w_Q96 %>% 
  dplyr::filter(grepl(pathway, pattern = "KEGG|^ire")) %>% 
  dplyr::filter(pathway %in% sig_all) %>% 
  left_join(sig_in_all) %>% 
  dplyr::filter(sig_in != "fHS only") %>% 
  mutate(sig = harmonic_p_FDR < 0.05) %>% 
  ggplot(aes(x = coef, y = pathway)) +
  geom_tile(aes(fill = -log10(harmonic_p)
                )
            ) +
  geom_tile(aes(alpha = harmonic_p_FDR > 0.05), show.legend = FALSE) +
  geom_text(aes(label = signif(harmonic_p_FDR, digits = 2)), 
            fontface = "bold") +
  geom_text(aes(label = signif(harmonic_p_FDR, digits = 2)), 
            data = . %>% dplyr::filter(harmonic_p_FDR > 0.05), 
            colour = "white", 
            fontface = "bold"
            ) +
  theme_bw() +
  theme(panel.grid = element_blank(), 
        panel.border = element_blank(), 
        strip.background = element_blank(), 
        strip.text = element_blank()
          ) + 
  scale_fill_viridis_c(option = 'viridis') +
  facet_grid(rows = vars(sig_in), scales = 'free_y', shrink = FALSE, space = 'free_y')  +
  labs(x = "", 
       y = "Gene set") 

  #ggsave("plots/comp_w_Q96.png", width = 10, height = 10, units = "cm", dpi = 800, scale = 2)

DATA EXPORT

saveRDS(x, "R_objects/dge.rds")
saveRDS(toptable_1, "R_objects/toptable_raw.rds")
# make a copy for export 
toptable_forExport  <- toptable_1
names(toptable_forExport) <- c("EOfADLike", "fHSLike")

writexl::write_xlsx(toptable_forExport, path = "R_objects/toptable_raw.xlsx")
writexl::write_xlsx(HMP_1, path = "R_objects/HMP_raw.xlsx")

saveRDS(HMP_1, "R_objects/HMP_raw_KEGG_IRE.rds")

session info

sessionInfo() %>% 
  pander()

R version 4.0.2 (2020-06-22)

Platform: x86_64-apple-darwin17.0 (64-bit)

locale: en_AU.UTF-8||en_AU.UTF-8||en_AU.UTF-8||C||en_AU.UTF-8||en_AU.UTF-8

attached base packages: stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: ensembldb(v.2.12.1), AnnotationFilter(v.1.12.0), GenomicFeatures(v.1.40.1), AnnotationDbi(v.1.50.3), Biobase(v.2.48.0), GenomicRanges(v.1.40.0), GenomeInfoDb(v.1.24.2), IRanges(v.2.22.2), S4Vectors(v.0.26.1), kableExtra(v.1.3.1), pander(v.0.6.3), UpSetR(v.1.4.0), RColorBrewer(v.1.1-2), pathview(v.1.28.1), goseq(v.1.40.0), geneLenDataBase(v.1.24.0), BiasedUrn(v.1.07), harmonicmeanp(v.3.0), FMStable(v.0.1-2), fgsea(v.1.14.0), ggeasy(v.0.1.2), pheatmap(v.1.0.12), msigdbr(v.7.2.1), ggrepel(v.0.8.2), cowplot(v.1.1.0), ggpubr(v.0.4.0), scales(v.1.1.1), readxl(v.1.3.1), AnnotationHub(v.2.20.2), BiocFileCache(v.1.12.1), dbplyr(v.2.0.0), ngsReports(v.1.4.2), BiocGenerics(v.0.34.0), ggfortify(v.0.4.11), edgeR(v.3.30.3), limma(v.3.44.3), magrittr(v.2.0.1), forcats(v.0.5.0), stringr(v.1.4.0), dplyr(v.1.0.2), purrr(v.0.3.4), readr(v.1.4.0), tidyr(v.1.1.2), tibble(v.3.0.4), ggplot2(v.3.3.2) and tidyverse(v.1.3.0)

loaded via a namespace (and not attached): utf8(v.1.1.4), tidyselect(v.1.1.0), RSQLite(v.2.2.1), htmlwidgets(v.1.5.2), FactoMineR(v.2.3), grid(v.4.0.2), BiocParallel(v.1.22.0), munsell(v.0.5.0), statmod(v.1.4.35), DT(v.0.16), withr(v.2.3.0), colorspace(v.2.0-0), highr(v.0.8), knitr(v.1.30), rstudioapi(v.0.13), leaps(v.3.1), ggsignif(v.0.6.0), labeling(v.0.4.2), KEGGgraph(v.1.48.0), GenomeInfoDbData(v.1.2.3), hwriter(v.1.3.2), farver(v.2.0.3), bit64(v.4.0.5), rhdf5(v.2.32.4), vctrs(v.0.3.5), generics(v.0.1.0), xfun(v.0.19), R6(v.2.5.0), locfit(v.1.5-9.4), bitops(v.1.0-6), DelayedArray(v.0.14.1), assertthat(v.0.2.1), promises(v.1.1.1), gtable(v.0.3.0), rlang(v.0.4.9), scatterplot3d(v.0.3-41), splines(v.4.0.2), rtracklayer(v.1.48.0), rstatix(v.0.6.0), lazyeval(v.0.2.2), broom(v.0.7.2), BiocManager(v.1.30.10), yaml(v.2.2.1), reshape2(v.1.4.4), abind(v.1.4-5), modelr(v.0.1.8), backports(v.1.2.0), httpuv(v.1.5.4), tools(v.4.0.2), ellipsis(v.0.3.1), ggdendro(v.0.1.22), Rcpp(v.1.0.5), plyr(v.1.8.6), progress(v.1.2.2), zlibbioc(v.1.34.0), RCurl(v.1.98-1.2), prettyunits(v.1.1.1), openssl(v.1.4.3), zoo(v.1.8-8), SummarizedExperiment(v.1.18.2), haven(v.2.3.1), cluster(v.2.1.0), fs(v.1.5.0), data.table(v.1.13.2), openxlsx(v.4.2.3), reprex(v.0.3.0), truncnorm(v.1.0-8), ProtGenerics(v.1.20.0), matrixStats(v.0.57.0), hms(v.0.5.3), mime(v.0.9), evaluate(v.0.14), xtable(v.1.8-4), XML(v.3.99-0.5), rio(v.0.5.16), jpeg(v.0.1-8.1), gridExtra(v.2.3), compiler(v.4.0.2), biomaRt(v.2.44.4), writexl(v.1.3.1), crayon(v.1.3.4), htmltools(v.0.5.0), mgcv(v.1.8-33), later(v.1.1.0.1), lubridate(v.1.7.9.2), DBI(v.1.1.0), MASS(v.7.3-53), rappdirs(v.0.3.1), ShortRead(v.1.46.0), Matrix(v.1.2-18), car(v.3.0-10), cli(v.2.2.0), pkgconfig(v.2.0.3), flashClust(v.1.01-2), GenomicAlignments(v.1.24.0), foreign(v.0.8-80), plotly(v.4.9.2.1), xml2(v.1.3.2), webshot(v.0.5.2), XVector(v.0.28.0), rvest(v.0.3.6), digest(v.0.6.27), graph(v.1.66.0), Biostrings(v.2.56.0), rmarkdown(v.2.5), cellranger(v.1.1.0), fastmatch(v.1.1-0), curl(v.4.3), shiny(v.1.5.0), Rsamtools(v.2.4.0), lifecycle(v.0.2.0), nlme(v.3.1-150), jsonlite(v.1.7.1), Rhdf5lib(v.1.10.1), carData(v.3.0-4), viridisLite(v.0.3.0), askpass(v.1.1), fansi(v.0.4.1), pillar(v.1.4.7), lattice(v.0.20-41), KEGGREST(v.1.28.0), fastmap(v.1.0.1), httr(v.1.4.2), GO.db(v.3.11.4), interactiveDisplayBase(v.1.26.3), glue(v.1.4.2), zip(v.2.1.1), png(v.0.1-7), BiocVersion(v.3.11.1), bit(v.4.0.4), Rgraphviz(v.2.32.0), stringi(v.1.5.3), blob(v.1.2.1), org.Hs.eg.db(v.3.11.4), latticeExtra(v.0.6-29) and memoise(v.1.1.0)